This notebook defines the most focal recurrent copy number units by removing focal changes that are within entire chromosome arm losses and gains. Most focal here meaning:

  • If a chromosome arm is not clearly defined as a gain or loss (and is callable) we look to define the cytoband level status
  • If a cytoband is not clearly defined as a gain or loss (and is callable) we then look to define the gene-level status

Usage

This notebook is intended to be run from the command line with the following (assumes you are in the root directory of the repository):

Rscript -e "rmarkdown::render('analyses/focal-cn-file-preparation/05-define-most-focal-cn-units.Rmd', clean = TRUE)"

Cutoffs:

# The fraction of calls a particular status needs to be
# above to be called the majority status -- the decision
# for a cutoff of 90% here was made to ensure that the status
# is not only the majority status but it is also significantly
# called more than the other status values in the region
status_threshold <- 0.9

# The fraction threshold for determining if enough of a region
# (arm, cytoband, or gene) is callable to determine its status --
# the decision for a cutoff of 50% here was made as it seems reasonable
# to expect a region to be more than 50% callable for a dominant status
# call to be made
uncallable_threshold <- 0.5

Set up

Libraries and functions

library(tidyverse)
── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.0     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Files and directories

results_dir <- "results"

Read in files

Read in cytoband status file and format it for what we will need in this notebook.

# Read in the file with consensus CN status data and the UCSC cytoband data --
# generated in `03-add-cytoband-status-consensus.Rmd`
consensus_seg_cytoband_status_df <-
  read_tsv(file.path("results", "consensus_seg_with_ucsc_cytoband_status.tsv.gz")) %>%
  # Need this to not have `chr`
  mutate(
    chr = gsub("chr", "", chr),
    cytoband = paste0(chr, cytoband)
  ) %>%
  select(
    chromosome_arm,
    # Distinguish this dominant status that is based on cytobands, from the status
    dominant_cytoband_status = dominant_status,
    cytoband,
    Kids_First_Biospecimen_ID,
    band_length,
    gain_fraction,
    loss_fraction,
    callable_fraction
  )
Parsed with column specification:
cols(
  Kids_First_Biospecimen_ID = col_character(),
  chr = col_character(),
  cytoband = col_character(),
  dominant_status = col_character(),
  band_length = col_double(),
  callable_fraction = col_double(),
  gain_fraction = col_double(),
  loss_fraction = col_double(),
  chromosome_arm = col_character()
)

Read in the gene-level data.

# Read in the annotated gene level CN file
consensus_seg_autosomes_df <-
  read_tsv(file.path(results_dir, "consensus_seg_annotated_cn_autosomes.tsv.gz")) %>%
  mutate(chromosome_arm = gsub("(p|q).*", "\\1", cytoband))
Parsed with column specification:
cols(
  biospecimen_id = col_character(),
  status = col_character(),
  copy_number = col_double(),
  ploidy = col_double(),
  ensembl = col_character(),
  gene_symbol = col_character(),
  cytoband = col_character()
)
# Rename "amplification" status calls to be "gain" for the purpose of this script
consensus_seg_autosomes_df$status <-
  gsub("amplification", "gain", consensus_seg_autosomes_df$status)

Define most focal units

Determine chromosome arm status

consensus_seg_arm_status <- consensus_seg_cytoband_status_df %>%
    # Group by biospecimen ID and region
    group_by(Kids_First_Biospecimen_ID, chromosome_arm) %>%
    # Summarize the weighted means for each status
    summarize(
      loss_fraction_arm = weighted.mean(loss_fraction, band_length),
      gain_fraction_arm = weighted.mean(gain_fraction, band_length),
      callable_fraction_arm = weighted.mean(callable_fraction, band_length)
    ) %>%
    # Define dominant status based the weighted means meeting a status
    # threshold
    mutate(
      dominant_arm_status = case_when(
        callable_fraction_arm < (1 - uncallable_threshold) ~ "uncallable",
        loss_fraction_arm > status_threshold ~ "loss",
        gain_fraction_arm > status_threshold ~ "gain",
        loss_fraction_arm + gain_fraction_arm > status_threshold ~ "unstable",
        TRUE ~ "neutral"
      )
    )

# Display table
consensus_seg_arm_status

Determine cytoband status

We want to include cytoband and gene-level calls for chromosome arms that have not been defined as a gain or loss to make the cytoband-level majority calls.

# Now define the cytoband as that status if more than the `status_threshold` 
# fraction value of the total counts are for that particular status
consensus_seg_cytoband_status <-
  consensus_seg_cytoband_status_df %>%
  mutate(
    dominant_cytoband_status = case_when(
      callable_fraction < (1 - uncallable_threshold) ~ "uncallable",
      loss_fraction > status_threshold ~ "loss",
      gain_fraction > status_threshold ~ "gain",
      loss_fraction + gain_fraction > status_threshold ~ "unstable",
      TRUE ~ "neutral"
    )
  ) 

# Join the consensus seg arm status data and filter to include only neutral
# chromosome arms and non-neutral cytobands
filtered_consensus_cytoband_status <- consensus_seg_cytoband_status %>%
  left_join(
    consensus_seg_arm_status,
    by = c(
      "Kids_First_Biospecimen_ID",
      "chromosome_arm"
    )
  ) %>%
  # Filter the annotated CN data to include only neutral chromosome arms and disagreements
  filter(
    dominant_arm_status %in% c("neutral", "uncallable", "unstable") |
      (
        dominant_cytoband_status != dominant_arm_status &
          # bands that disagree with arm, but are not neutral (or uncallable)
          !(dominant_cytoband_status %in% c("neutral", "uncallable", "unstable"))
      )
  ) %>%
  select(
    Kids_First_Biospecimen_ID,
    cytoband,
    loss_fraction,
    gain_fraction,
    callable_fraction,
    dominant_cytoband_status
  )

# Display table
filtered_consensus_cytoband_status

Determine gene-level status

# Create separate rows for genes span multiple cytobands
# CAUTION: This will require addition of a distinct() statment later to resolve duplicates
# Filtering to only multiband genes first because regexes are slow.
multi_band_genes <- consensus_seg_autosomes_df %>%
  filter(grepl("-", cytoband)) %>%
  extract(cytoband, into = c("chrom", "band"), regex = "([0-9]+)(.+)") %>% # make chrom and band cols
  separate_rows(band, sep = "-") %>% # duplicate rows with more than one band
  unite(cytoband, chrom, band, sep = "") # rejoin chrom and band

# Filter to singleband genes
single_band_genes <- consensus_seg_autosomes_df %>%
  filter(!grepl("-", cytoband))
gene_df <- bind_rows(single_band_genes, multi_band_genes)

# Now create a data.frame with the gene-level status calls for the
# neutral, uncallable, and unstable cytoband-level and chromosome arm-level
# calls
filtered_consensus_gene_status <- gene_df %>%
  left_join(
    consensus_seg_arm_status,
    by = c("biospecimen_id" = "Kids_First_Biospecimen_ID",
           "chromosome_arm")
  ) %>%
  left_join(
    consensus_seg_cytoband_status,
    by = c("biospecimen_id" = "Kids_First_Biospecimen_ID",
           "cytoband")
  ) %>%
  # Filter the annotated CN data to include only neutral arms and cytobands,
  # and disagreements
  filter(
    # Case 1) Gene call disagrees with both arm and cytoband (This captures most
    # we want to keep, including all of when arm and cytoband are neutral, uncallable
    # or unstable, since we have no neutral gene calls in this df):
    (
      status != dominant_arm_status & status != dominant_cytoband_status
    )
    # Case 2) Gene call disagrees with a non-neutral cytoband call.
    # Keep no matter what the arm status
    | (
      status != dominant_cytoband_status
      & dominant_cytoband_status %in% c("gain", "loss")
    )
    # I think that captures everything we want. Cases we don't want include:
    # gene & arm agree, but cytoband is neutral
    # gene & cytoband agree
    # all 3 agree
  ) %>%
  select(Kids_First_Biospecimen_ID = biospecimen_id,
         gene_symbol,
         status) %>%
  # The `distinct()` function is needed to remove duplicates resulting from
  # the band separation into multiple rows step above
  distinct()

# Display table
filtered_consensus_gene_status

Combine arm, cytoband, and gene-level status data

# Rename each the dominant status columns of the data.frames
# to be uniform for the binding rows step and filter out "neutral" calls
consensus_seg_arm_status <- consensus_seg_arm_status %>%
  filter(!(dominant_arm_status == "neutral")) %>%
  rename(dominant_status = dominant_arm_status)

filtered_consensus_cytoband_status <- filtered_consensus_cytoband_status %>%
  filter(!(dominant_cytoband_status == "neutral")) %>%
  rename(dominant_status = dominant_cytoband_status)

# There are no "neutral" calls at the gene level so we do not need to filter
# out those calls here
filtered_consensus_gene_status <- filtered_consensus_gene_status %>%
  rename(dominant_status = status)

# For each of the datasets we're joining, we'll only keep these columns:
cols_to_keep <- c("Kids_First_Biospecimen_ID", "dominant_status")

Combine into one long data frame

# Combine the arm, cytoband, and gene status count data
final_df <- bind_rows(
  arm = select(consensus_seg_arm_status,
               cols_to_keep,
               region = chromosome_arm),
  cytoband = select(filtered_consensus_cytoband_status,
                    cols_to_keep,
                    region = cytoband),
  gene = select(filtered_consensus_gene_status,
                cols_to_keep,
                region = gene_symbol),
  .id = "region_type"
) %>%
  # Reorder columns more sensibly
  select(Kids_First_Biospecimen_ID,
         status = dominant_status,
         region,
         region_type) %>%
  arrange(Kids_First_Biospecimen_ID)

# Print out preview
final_df

Write to a TSV file

# Write final long status table to file
write_tsv(
  final_df,
  file.path(results_dir, "consensus_seg_most_focal_cn_status.tsv.gz")
)

# Display final long status table
final_df %>%
  arrange(region_type)

Session Info

sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)

Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.3     purrr_0.3.2    
[5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.0  
[9] tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.1       cellranger_1.1.0 pillar_1.4.2     compiler_3.6.0  
 [5] base64enc_0.1-3  tools_3.6.0      digest_0.6.20    lubridate_1.7.4 
 [9] jsonlite_1.6     evaluate_0.14    nlme_3.1-140     gtable_0.3.0    
[13] lattice_0.20-38  pkgconfig_2.0.2  rlang_0.4.0      cli_1.1.0       
[17] rstudioapi_0.10  yaml_2.2.0       haven_2.1.1      xfun_0.8        
[21] withr_2.1.2      xml2_1.2.0       httr_1.4.0       knitr_1.23      
[25] hms_0.4.2        generics_0.0.2   grid_3.6.0       tidyselect_0.2.5
[29] glue_1.3.1       R6_2.4.0         readxl_1.3.1     rmarkdown_1.13  
[33] modelr_0.1.4     magrittr_1.5     backports_1.1.4  scales_1.0.0    
[37] htmltools_0.3.6  rvest_0.3.4      assertthat_0.2.1 colorspace_1.4-1
[41] stringi_1.4.3    lazyeval_0.2.2   munsell_0.5.0    broom_0.5.2     
[45] crayon_1.3.4    
---
title: "Find most focal recurrent copy number units"
output: 
  html_notebook:
    toc: TRUE
    toc_float: TRUE
author: Chante Bethell and Candace Savonen for ALSF CCDL
date: 2020
---

This notebook defines the most focal recurrent copy number units by removing focal changes that are within entire chromosome arm losses and gains.
_Most focal_ here meaning:

- If a chromosome arm is not clearly defined as a gain or loss (and is callable) we look to define the cytoband level status
- If a cytoband is not clearly defined as a gain or loss (and is callable) we then look to define the gene-level status

## Usage

This notebook is intended to be run from the command line with the following (assumes you are in the root directory of the repository):

```
Rscript -e "rmarkdown::render('analyses/focal-cn-file-preparation/05-define-most-focal-cn-units.Rmd', clean = TRUE)"
```

### Cutoffs: 

```{r}
# The fraction of calls a particular status needs to be
# above to be called the majority status -- the decision
# for a cutoff of 90% here was made to ensure that the status
# is not only the majority status but it is also significantly
# called more than the other status values in the region
status_threshold <- 0.9

# The fraction threshold for determining if enough of a region
# (arm, cytoband, or gene) is callable to determine its status --
# the decision for a cutoff of 50% here was made as it seems reasonable
# to expect a region to be more than 50% callable for a dominant status
# call to be made
uncallable_threshold <- 0.5
```

## Set up

### Libraries and functions

```{r}
library(tidyverse)
```

### Files and directories

```{r}
results_dir <- "results"
```

### Read in files

Read in cytoband status file and format it for what we will need in this notebook. 

```{r}
# Read in the file with consensus CN status data and the UCSC cytoband data --
# generated in `03-add-cytoband-status-consensus.Rmd`
consensus_seg_cytoband_status_df <-
  read_tsv(file.path("results", "consensus_seg_with_ucsc_cytoband_status.tsv.gz")) %>%
  # Need this to not have `chr`
  mutate(
    chr = gsub("chr", "", chr),
    cytoband = paste0(chr, cytoband)
  ) %>%
  select(
    chromosome_arm,
    # Distinguish this dominant status that is based on cytobands, from the status
    dominant_cytoband_status = dominant_status,
    cytoband,
    Kids_First_Biospecimen_ID,
    band_length,
    gain_fraction,
    loss_fraction,
    callable_fraction
  )
```

Read in the gene-level data. 

```{r}
# Read in the annotated gene level CN file
consensus_seg_autosomes_df <-
  read_tsv(file.path(results_dir, "consensus_seg_annotated_cn_autosomes.tsv.gz")) %>%
  mutate(chromosome_arm = gsub("(p|q).*", "\\1", cytoband))

# Rename "amplification" status calls to be "gain" for the purpose of this script
consensus_seg_autosomes_df$status <-
  gsub("amplification", "gain", consensus_seg_autosomes_df$status)
```

## Define most focal units

### Determine chromosome arm status

```{r}
consensus_seg_arm_status <- consensus_seg_cytoband_status_df %>%
    # Group by biospecimen ID and region
    group_by(Kids_First_Biospecimen_ID, chromosome_arm) %>%
    # Summarize the weighted means for each status
    summarize(
      loss_fraction_arm = weighted.mean(loss_fraction, band_length),
      gain_fraction_arm = weighted.mean(gain_fraction, band_length),
      callable_fraction_arm = weighted.mean(callable_fraction, band_length)
    ) %>%
    # Define dominant status based the weighted means meeting a status
    # threshold
    mutate(
      dominant_arm_status = case_when(
        callable_fraction_arm < (1 - uncallable_threshold) ~ "uncallable",
        loss_fraction_arm > status_threshold ~ "loss",
        gain_fraction_arm > status_threshold ~ "gain",
        loss_fraction_arm + gain_fraction_arm > status_threshold ~ "unstable",
        TRUE ~ "neutral"
      )
    )

# Display table
consensus_seg_arm_status
```

### Determine cytoband status

We want to include cytoband and gene-level calls for chromosome arms that have not been defined as a gain or loss to make the cytoband-level majority calls.

```{r}
# Now define the cytoband as that status if more than the `status_threshold` 
# fraction value of the total counts are for that particular status
consensus_seg_cytoband_status <-
  consensus_seg_cytoband_status_df %>%
  mutate(
    dominant_cytoband_status = case_when(
      callable_fraction < (1 - uncallable_threshold) ~ "uncallable",
      loss_fraction > status_threshold ~ "loss",
      gain_fraction > status_threshold ~ "gain",
      loss_fraction + gain_fraction > status_threshold ~ "unstable",
      TRUE ~ "neutral"
    )
  ) 

# Join the consensus seg arm status data and filter to include only neutral
# chromosome arms and non-neutral cytobands
filtered_consensus_cytoband_status <- consensus_seg_cytoband_status %>%
  left_join(
    consensus_seg_arm_status,
    by = c(
      "Kids_First_Biospecimen_ID",
      "chromosome_arm"
    )
  ) %>%
  # Filter the annotated CN data to include only neutral chromosome arms and disagreements
  filter(
    dominant_arm_status %in% c("neutral", "uncallable", "unstable") |
      (
        dominant_cytoband_status != dominant_arm_status &
          # bands that disagree with arm, but are not neutral (or uncallable)
          !(dominant_cytoband_status %in% c("neutral", "uncallable", "unstable"))
      )
  ) %>%
  select(
    Kids_First_Biospecimen_ID,
    cytoband,
    loss_fraction,
    gain_fraction,
    callable_fraction,
    dominant_cytoband_status
  )

# Display table
filtered_consensus_cytoband_status
```

### Determine gene-level status

```{r}
# Create separate rows for genes span multiple cytobands
# CAUTION: This will require addition of a distinct() statment later to resolve duplicates
# Filtering to only multiband genes first because regexes are slow.
multi_band_genes <- consensus_seg_autosomes_df %>%
  filter(grepl("-", cytoband)) %>%
  extract(cytoband, into = c("chrom", "band"), regex = "([0-9]+)(.+)") %>% # make chrom and band cols
  separate_rows(band, sep = "-") %>% # duplicate rows with more than one band
  unite(cytoband, chrom, band, sep = "") # rejoin chrom and band

# Filter to singleband genes
single_band_genes <- consensus_seg_autosomes_df %>%
  filter(!grepl("-", cytoband))
gene_df <- bind_rows(single_band_genes, multi_band_genes)

# Now create a data.frame with the gene-level status calls for the
# neutral, uncallable, and unstable cytoband-level and chromosome arm-level
# calls
filtered_consensus_gene_status <- gene_df %>%
  left_join(
    consensus_seg_arm_status,
    by = c("biospecimen_id" = "Kids_First_Biospecimen_ID",
           "chromosome_arm")
  ) %>%
  left_join(
    consensus_seg_cytoband_status,
    by = c("biospecimen_id" = "Kids_First_Biospecimen_ID",
           "cytoband")
  ) %>%
  # Filter the annotated CN data to include only neutral arms and cytobands,
  # and disagreements
  filter(
    # Case 1) Gene call disagrees with both arm and cytoband (This captures most
    # we want to keep, including all of when arm and cytoband are neutral, uncallable
    # or unstable, since we have no neutral gene calls in this df):
    (
      status != dominant_arm_status & status != dominant_cytoband_status
    )
    # Case 2) Gene call disagrees with a non-neutral cytoband call.
    # Keep no matter what the arm status
    | (
      status != dominant_cytoband_status
      & dominant_cytoband_status %in% c("gain", "loss")
    )
    # I think that captures everything we want. Cases we don't want include:
    # gene & arm agree, but cytoband is neutral
    # gene & cytoband agree
    # all 3 agree
  ) %>%
  select(Kids_First_Biospecimen_ID = biospecimen_id,
         gene_symbol,
         status) %>%
  # The `distinct()` function is needed to remove duplicates resulting from
  # the band separation into multiple rows step above
  distinct()

# Display table
filtered_consensus_gene_status
```

## Combine arm, cytoband, and gene-level status data

```{r}
# Rename each the dominant status columns of the data.frames
# to be uniform for the binding rows step and filter out "neutral" calls
consensus_seg_arm_status <- consensus_seg_arm_status %>%
  filter(!(dominant_arm_status == "neutral")) %>%
  rename(dominant_status = dominant_arm_status)

filtered_consensus_cytoband_status <- filtered_consensus_cytoband_status %>%
  filter(!(dominant_cytoband_status == "neutral")) %>%
  rename(dominant_status = dominant_cytoband_status)

# There are no "neutral" calls at the gene level so we do not need to filter
# out those calls here
filtered_consensus_gene_status <- filtered_consensus_gene_status %>%
  rename(dominant_status = status)

# For each of the datasets we're joining, we'll only keep these columns:
cols_to_keep <- c("Kids_First_Biospecimen_ID", "dominant_status")
```

Combine into one long data frame

```{r}
# Combine the arm, cytoband, and gene status count data
final_df <- bind_rows(
  arm = select(consensus_seg_arm_status,
               cols_to_keep,
               region = chromosome_arm),
  cytoband = select(filtered_consensus_cytoband_status,
                    cols_to_keep,
                    region = cytoband),
  gene = select(filtered_consensus_gene_status,
                cols_to_keep,
                region = gene_symbol),
  .id = "region_type"
) %>%
  # Reorder columns more sensibly
  select(Kids_First_Biospecimen_ID,
         status = dominant_status,
         region,
         region_type) %>%
  arrange(Kids_First_Biospecimen_ID)

# Print out preview
final_df
```

## Write to a TSV file 

```{r}
# Write final long status table to file
write_tsv(
  final_df,
  file.path(results_dir, "consensus_seg_most_focal_cn_status.tsv.gz")
)

# Display final long status table
final_df %>%
  arrange(region_type)
```

## Session Info

```{r}
sessionInfo()
```
